# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# First steps in R. Variables, summary, folders, data sets
# Vectors and simple operations
> x <- c(1,3,5,6)
# Create a vector (c means concatenate)
> x = c(1,3,5,6) # Another way to define a vector
> x
[1] 1 3 5 6
> x[2] # Get the 2nd element of vector x
[1] 3
> x[2:4] # Get all elements of x from the 2nd to the 4th
[1] 3 5 6
> x = rnorm(10000,2,100) # Generate a vector of 10,000 Normal random variables
# with mean 2 and st. deviation 100
# Basic statistics
> mean(x)
[1] 2.379067
> sd(x)
[1] 100.0676
# Arithmetic operations
> x = c(1,3,5,7,0,-1)
> x
[1]  1  3  5  7  0 -1
> x^2
[1]  1  9 25 49  0  1
> sin(x)
[1]  0.8414710  0.1411200 -0.9589243  0.6569866  0.0000000 -0.8414710
> log(x)
[1] 0.000000 1.098612 1.609438 1.945910     -Inf      NaN
Warning message:
In log(x) : NaNs produced
# Define a matrix A based on a vector x
> A = matrix(x,2,3)
> A
     [,1] [,2] [,3]
[1,]    1    5    0
[2,]    3    7   -1
# READING DATA FROM EXTERNAL FILES
# To point to the right folder, go "File" -> "Change dir..." or use the setwd command
# Which folder is R pointed to right now?
> getwd()
[1] "C:/Users/baron/Documents"
# Let's change the folder to the one where we have data. Notice slashes.
> setwd("C:/Users/baron/627 Statistical Machine Learning/data")
# Use read.table("file.txt") to read text files
# Rda and Rdata files should be opened with load("file.rda")
> load("Auto.rda")
# Find out what variables are in the set
> dim(Auto)
[1] 392 9
> names(Auto)
[1] "mpg"          "cylinders"    "displacement"
[4] "horsepower"   "weight"       "acceleration"
[7] "year"         "origin"       "name"
> summary(Auto)
      mpg          cylinders      displacement
 Min.   : 9.00   Min.   :3.000   Min.   : 68.0
 1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0
 Median :22.75   Median :4.000   Median :151.0
 Mean   :23.45   Mean   :5.472   Mean   :194.4
 3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8
 Max.   :46.60   Max.   :8.000   Max.   :455.0
   horsepower        weight      acceleration
 Min.   : 46.0   Min.   :1613   Min.   : 8.00
 1st Qu.: 75.0   1st Qu.:2225   1st Qu.:13.78
 Median : 93.5   Median :2804   Median :15.50
 Mean   :104.5   Mean   :2978   Mean   :15.54
 3rd Qu.:126.0   3rd Qu.:3615   3rd Qu.:17.02
 Max.   :230.0   Max.   :5140   Max.   :24.80
      year           origin                  name
 Min.   :70.00   Min.   :1.000   amc matador       :  5
 1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5
 Median :76.00   Median :1.000   toyota corolla    :  5
 Mean   :75.98   Mean   :1.577   amc gremlin       :  4
 3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4
 Max.   :82.00   Max.   :3.000   (Other)           :365
# Look at the data as a spreadsheet
> fix(Auto)
# Refer to the particular variable in this dataset with $ sign...
> Auto$name
[1] chevrolet chevelle malibu
[2] buick skylark 320
[3] plymouth satellite
[4] amc rebel sst
[5] ford torino
< truncated >
# or attach the dataset that you plan to work with...
> attach(Auto)
> name
[1] chevrolet chevelle malibu
[2] buick skylark 320
[3] plymouth satellite
[4] amc rebel sst
[5] ford torino
< truncated >
# Descriptive statistics: mean and the 5-number summary
> mean(mpg)
[1] 23.44592
> summary(mpg)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
   9.00   17.00   22.75   23.45   29.00   46.60
# PLOTS.
# Before you do anything with the data, look at them.
> plot(weight,mpg)
> plot(cylinders,mpg)
# Perhaps we should treat “cylinders” as a categorical variable?
> cyl = as.factor(cylinders)
> plot(cyl,mpg) # When one variable is categorical, we get
                # boxplots of the other variable
# Axis labels, graph title, color
> plot(weight, mpg, xlab="Weight", ylab="MPG", main="Plot of Miles per Gallon", col="blue")
# SCATTERPLOT MATRIX #
# Use it to plot more than 2 variables.
# First, partition the graphing window into a matrix
> par(mfrow=c(4,4))
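# Then fill each non-diagonal space with the corresponding scatterplot
# (Note: pairs() sets up its own grid of panels, one row and column per
# variable, so it does not depend on the par(mfrow) setting.)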
> pairs(~mpg+weight+horsepower+year)
# Saving a graph in a file
> pdf("filename.pdf")
> plot(weight, mpg, xlab="Weight", ylab="MPG", col="blue")
> dev.off()
windows
2
# Finish and quit R
> q()